* Settings
* Pin the interpreter to Stata 16 behavior so later Stata releases run this script the same way.
version 16
* Run the shared project settings script. The path below is built from the global SSDIMed,
* so that global has to be defined before this do-file is started.
* NOTE(review): what the settings script defines is not visible from here -- confirm before relying on it.
do "$SSDIMed/scripts/_auxiliary/_project_settings.do"

* -----------------------------------------------------------------------------------------------
* Build dataset on economic conditions, population, in months of SSDI filing
* Inputs:
*   BLS - Unemployment rates by county and month
*   Census - CDC Wonder population counts
*   SSA DAF PUF - Counts of SSDI beneficiaries, by [filing month] x [Medicare eligibility month]
* Output:
*   Unemployment and population in months of SSDI filing, by county and SSDI Medicare eligibility month
* -----------------------------------------------------------------------------------------------

***********
* Merging DAF PUF and master cohort 
*   master cohort has: fips code, age at Medicare entry, covstart month 
*   DAFPUF has: covstart month, age at Medicare entry, month of application (for 10% sample) 
*   LAUS has: moving average number of unemployed and unemployment rate for previous 12 months in the county X yearmonth 
* Steps:
*   Step 1 (DAFPUF): for each month of Medicare entry, get distribution of months of application 
*   Step 2 (BLS & Census): county and month of Medicare entry, calculate average unemployment rate and population in months of SSDI filing


****************************
* Step 1 (DAFPUF): for each month of Medicare entry, get distribution of months of application 

* Load the DAF PUF annual demographic sample
use "$SSDIMed/data/proc/ssa/DAF18/daf18_puf_ann_dmg_sample.dta", clear

* Filing month: monthly date derived from the daily date bdof_puf1
generate ssdi_filing_month = mofd(bdof_puf1)
format %tm ssdi_filing_month

* Unemployment and county population series begin in 1990m1, so keep only filing months
* from 1990m1 onward. A missing filing month survives this filter (missing sorts above
* every number) and is caught by the assert that follows.
keep if ssdi_filing_month >= tm(1990m1)
assert !missing(ssdi_filing_month, covstart_month)

* Count records in each [filing month] x [Medicare coverage start month] cell
generate double ssdi_benes = 1
gcollapse (sum) ssdi_benes, by(ssdi_filing_month covstart_month) fast
label variable ssdi_filing_month "SSDI month of filing entry 1 = mofd(bdof_puf1)"
label variable ssdi_benes "Number of ssdi_benes per filing and Medicare entry month"

* Diagnostic only: beneficiary-weighted distribution of months between filing and coverage start
generate diff = covstart_month - ssdi_filing_month
summarize diff [aweight = ssdi_benes], detail
drop diff

* Arrange, sort and store the cells for the joinby in Step 2
order ssdi_filing_month covstart_month
sort ssdi_filing_month covstart_month
compress
tempfile filing_covstart_cells
save `filing_covstart_cells'


****************************
* Step 2 (BLS, Census, Step 1): construct population and unemployment rates for each covstart month, based on conditions when individuals filed for SSDI
* Note: it takes about 1.5 hours to run Step 2, due to the joinby, merge, and gcollapse steps

* County unemployment rates: one row per county and month
use "$SSDIMed/data/proc/bls/laus/bls_laus_county_monthly.dta", clear
gisid fipscounty yearmonth
assert year >= 1990 & year <= 2019
keep fipscounty yearmonth unemp_rate_county 
order fipscounty yearmonth

* Add in national rates: 20250902 CC changed this from an old filename (that was confirmed to have the same values) to the one made on 01b
* Every county month must find a national rate; national months without county data are discarded.
merge m:1 yearmonth using "$SSDIMed/data/proc/bls/labor_force/bls_labor_force_us_monthly.dta", assert(match using) keep(match) keepusing(unemp_rate_us) nogenerate noreport

* The national rate gets its final name, and the calendar month becomes the SSDI filing month
* so it can serve as the join key against the Step 1 cells
rename (unemp_rate_us yearmonth) (unemp_rate_national ssdi_filing_month)

* Join in covstart month observations (and # of filers), by ssdi filing month.
* Each county x filing month row is paired with every covstart month observed for that filing month.
joinby ssdi_filing_month using `filing_covstart_cells'
gisid fipscounty covstart_month ssdi_filing_month

* Population counts are annual (an obs in the using files is fipscounty year), so recover the filing year
generate year = yofd(dofm(ssdi_filing_month))

* Attach the wide population files: first counts by age and overall, then the gender-specific counts.
* Rows without a population match are retained.
foreach popfile in "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byage_1990-2019_wide.dta" "$SSDIMed/data/proc/census/popest/cdc-wonder-county-pop_byagegender_1990-2019_wide.dta" {
  merge m:1 fipscounty year using "`popfile'", keep(match master) nogenerate noreport
}

* Store the joined file on disk; the two collapse steps below each read a subset of its variables
save "$SSDIMed/data/temp/bls_census.dta", replace


* Collapse down to covstart_month level, weighting by ssdi benes applying in each month
* Note: `gcollapse` chokes with many population variables. For replicability, use it to collapse unemp* vars, since 
*   `collapse` gives tiny (machine precision) differences that result in two mismatches in q20 bins, due to edge cases. 
*   Collapse population variables using `collapse`.

** collapse unemployment
* Beneficiary-weighted means of the county rate, the national rate and the 18-65 population,
* by county and Medicare coverage start month
use unemp_rate_county unemp_rate_national pop_18_65 ssdi_benes fipscounty covstart_month using "$SSDIMed/data/temp/bls_census.dta", clear
gcollapse (mean) unemp_rate_county unemp_rate_national pop_18_65 [aw = ssdi_benes], by(fipscounty covstart_month) labelformat(#sourcelabel#)

* Population weight for unemployment distribution (identical for ventiles and deciles, so set once)
local pop_weight pop_18_65

* Quantiles of unemployment
foreach nq in 20 10 {
  * The following code will produce missing values of xtile variable when population (weighting variable) is zero or missing
  *assert `pop_weight' > 0 & !missing(`pop_weight')
  *gquantiles unemp_rate_county [aw = `pop_weight'], xtile(unemp_rate_q`nq') nquantiles(`nq') 
  
  * Alternative: compute the weighted cutoffs only, then assign a bin to every observation,
  * including those whose weight is zero or missing
  gquantiles unemp_rate_county [aw = `pop_weight'], _pctile nquantiles(`nq') 

  * Cutoffs q0 ... q(nq): 0 and 100 bracket the estimated percentiles.
  * They are deliberately kept in local macros, exactly as before, so bin assignment is unchanged.
  local q0 0
  forvalues q = 1/`=`nq'-1' {
    local q`q' = r(r`q')
  }
  local q`nq' 100

  * inrange() is closed on both ends and bins are filled in ascending order, so a rate that
  * sits exactly on a cutoff ends up in the higher of the two adjacent bins
  gen unemp_rate_q`nq' = 0
  forvalues q = 1/`nq' {
    replace unemp_rate_q`nq' = `q' if inrange(unemp_rate_county, `q`=`q'-1'', `q`q'')
  }

  * QC:
  assert !missing(unemp_rate_q`nq', unemp_rate_county)
  * The bin variable starts at 0 and is therefore never missing, so the check above cannot
  * detect an unassigned observation (for example a rate outside 0-100). Require a valid bin.
  assert inrange(unemp_rate_q`nq', 1, `nq')
  gstats tab unemp_rate_county [aw = `pop_weight'], by(unemp_rate_q`nq')
  order unemp_rate_q`nq', after(unemp_rate_county)
  if `nq'==10 label var unemp_rate_q`nq' "Deciles of unemp_rate_county (weighted by `pop_weight')"
  if `nq'==20 label var unemp_rate_q`nq' "Ventiles of unemp_rate_county (weighted by `pop_weight')"
}

* The weight was only needed for the quantiles; population variables come from the separate collapse
drop pop_18_65 
tempfile collapse_unemp
save `collapse_unemp'


** collapse population, including gender-specific population counts
use pop* ssdi_benes fipscounty covstart_month year using "$SSDIMed/data/temp/bls_census.dta", clear

* QC before collapsing: Puerto Rico rows (state fips 72) must have no population counts
foreach popvar of varlist pop18* pop60* pop_all* pop_18_60* {
  assert missing(`popvar') if (substr(fipscounty, 1, 2) == "72")
}

* collapse overwrites variable labels, so remember each one first
* (a variable without a label falls back to its own name)
foreach x of varlist _all {
  local l`x' : variable label `x'
  if `"`l`x''"' == "" local l`x' "`x'"
}

* Beneficiary-weighted mean of every population variable, by county and coverage start month
collapse (mean) pop* [aweight = ssdi_benes], by(fipscounty covstart_month)

* Put the remembered labels back on the variables that survived the collapse
foreach x of varlist _all {
  label variable `x' "`l`x''"
}

* QC after collapsing: the same Puerto Rico check, echoing each variable name as it is tested
foreach popvar of varlist pop18* pop60* pop_all* pop_18_60* {
  display "`popvar'"
  assert missing(`popvar') if (substr(fipscounty, 1, 2) == "72")
}

tempfile collapse_pop
save `collapse_pop'


* Combine the unemployment and population collapses; both must contain exactly the same
* county x coverage start month cells
use `collapse_unemp', clear 
merge 1:1 fipscounty covstart_month using `collapse_pop', assert(match) nogenerate noreport

* Suffix variables with _atapp to mark that they describe conditions in the months of application
rename pop* pop*_atapp
rename unemp* unemp*_atapp

* Confirm one row per county x coverage start month (this also sorts on those keys), then shrink storage
bysort fipscounty covstart_month: assert _N == 1
compress

* Column layout: keys, then population variables, with female and then male counts moved to the end
order fipscounty covstart_month pop*_atapp
order pop*_f_atapp, last
order pop*_m_atapp, last

* version with unemp + population by gender (used in supplemental analysis) - WIDE
save "$SSDIMed/data/proc/public/unemp_popgender_atapp.dta", replace

* version with population by gender only (used in supplemental analysis) - LONG
use "$SSDIMed/data/proc/public/unemp_popgender_atapp.dta", clear

* Keep the keys plus the gender-specific counts whose age part is one or two characters long,
* and strip the _atapp suffix so the names can be used as reshape stubs
keep fipscounty covstart_month pop?_?_atapp pop??_?_atapp
rename pop*_atapp pop*

* Stub list for ages 0 through 85; each stub ends in an underscore and is completed by the gender letter
local popvars
foreach age of numlist 0/85 {
    local popvars `popvars' pop`age'_
}

* First reshape: the gender letter becomes a string key
greshape long `popvars', by(fipscounty covstart_month) keys(gender) string

* Second reshape: drop the trailing underscore so the age number becomes the key
rename pop*_ pop*
greshape long pop, by(fipscounty covstart_month gender) keys(age)

* Final names and labels
rename (pop age) (pop_atapp age_atapp)
label variable gender "gender"
label variable age_atapp "Single year of age through 84 years, age85 and older"

* One row per county x coverage start month x gender x age (this also sorts on those keys)
bysort fipscounty covstart_month gender age_atapp: assert _N == 1 
save "$SSDIMed/data/proc/public/popgender_atapp_long.dta", replace

* original version (no population by gender) - WIDE
* (revision on 2024-08-16 exactly matches original)
* Start again from the wide file with gender counts and remove the female and male columns
use "$SSDIMed/data/proc/public/unemp_popgender_atapp.dta", clear
drop pop*_f_atapp
drop pop*_m_atapp
* Optional comparison against the previously saved file; left disabled
*cf _all using  "$SSDIMed/data/proc/public/unemp_pop_atapp.dta", all
save "$SSDIMed/data/proc/public/unemp_pop_atapp.dta", replace


